This section examines five features: credit score, debt-to-income ratio, loan-to-value ratio, median household income at the 3-digit zip code level, and the dollar-amount change in mortgage loans made 1 year ago and 5 years ago. The last of these features was created by taking the total loan amount during a fiscal-year quarter for each bank within a 3-digit zip code.
Using saved models, each of the five features was replaced, one at a time, by an improved and a weakened assumption based on the interquartile range (25th–75th percentiles) of the feature across all banks, and predicted probabilities were generated to see what the expected foreclosure rate would be if each bank’s behavior were different. For example, a high credit score is associated with fewer foreclosures. Among all banks, the average credit score was 719 (on a scale of 300 to 850). I modified the credit score at each bank to the 75th percentile—an improved assumption of a 770 credit score—and to the 25th percentile—a weakened assumption of a 675 credit score. I left all other feature values unchanged. I ran these values through the saved model detailed in the section above and analyzed the change in foreclosure rates. One can interpret the findings as: “If GMAC Mortgage only lent to those with a credit score of 770, with all other considerations staying the same, its foreclosure rate is predicted to fall from 9.7% to 1%.”
# Load functions
# IPython magic: executes the companion notebook so that its definitions land in
# this namespace. No import statement appears in this file, so every name used
# below (pd, np, os, pickle, train_test_split, change_date, onehotencoding,
# pca_fred, Bank_Subsets, changing_assumptions) presumably comes from there --
# TODO confirm against Functions.ipynb.
%run Functions.ipynb
# Let pandas display up to 200 columns and 200 rows before truncating output.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 200)
# Load data
# The path is assembled with os.path.join instead of a '..\Data\...' literal:
# the literal relied on invalid escape sequences (\D, \P, \d) and only resolved
# on Windows. The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
with open(os.path.join('..', 'Data', 'Pickle', 'df.pkl'), 'rb') as file_to_open:
    df = pickle.load(file_to_open)
# Drop the merge ID column ('Loan ID')
df = df.drop(labels='Loan ID', axis=1)
# Convert Inf values to NA
df = df.replace([np.inf, -np.inf], np.nan)
## Bank and Classifier Lists
# Individual bank names, one per line.
banks = [
    'Bank of America',
    'Wells Fargo Bank',
    'CitiMortgage',
    'JPMorgan Chase',
    'GMAC Mortgage',
    'SunTrust Mortgage',
    'AmTrust Bank',
    'PNC Bank',
    'Flagstar Bank',
]
# The same names followed by the pooled 'All Banks' label.
banks_plus = [*banks, 'All Banks']
# Short labels for the classifiers.
clfs_str = ['RFC', 'RFC PCA', 'RUS Boost']
# Rename Columns
# Old label -> new label; the new labels are the ones referenced further down.
column_renames = {
    "Original Combined Loan-to-Value (CLTV)": "Loan-to-Value (LTV)",
    "Original Debt to Income Ratio": "Debt-to-Income",
    "Loan Change (1 Year)": "Loan Change (1 Yr)",
    "Loan Change (5 Years)": "Loan Change (5 Yr)",
    "Lnlsnet (1 Yr)": "Loan Liabilities (1 Yr)",
    "Lnlsnet (5 Yr)": "Loan Liabilities (5 Yr)",
}
df = df.rename(columns=column_renames)
## Create an environment variable to avoid using the GPU. This can be changed.
os.environ.update({'CUDA_VISIBLE_DEVICES': '-1'})
# Verify Bank Counts
# Bare expression: the result is not stored, so the per-bank row counts are only
# shown when this is the last statement of a notebook cell.
df['Bank'].value_counts()
# Variables to drop
dropvars = [
    'File Year', 'Year', 'Month', 'Region', 'FIPS',
    'Zip Code', 'Mortgage Insurance Type', 'Property State',
    'First Payment', 'Original Loan-to-Value (LTV)',
]
df = df.drop(columns=dropvars)
# Each pattern keeps only the columns whose label does NOT start with the given
# prefix (Asset, Liab, Eqtot, Dep), applied one after another.
prefix_filters = (
    r'^(?!Asset).*$',
    r'^(?!Liab).*$',
    r'^(?!Eqtot).*$',
    r'^(?!Dep).*$',
)
for prefix_filter in prefix_filters:
    df = df.filter(regex=prefix_filter)
# Convert date columns
# 'Reported Period' is cast float -> int -> str, left-padded with zeros to six
# characters, then split into "<first two chars>/<rest>" (presumably MM/YYYY).
reported_period = df['Reported Period'].astype(float).astype(int).astype(str)
reported_period = reported_period.str.zfill(6)
df['Reported Period'] = reported_period.str[:2] + '/' + reported_period.str[2:]
# change_date is defined outside this file; it is applied to both date columns
# in the same order as before.
for date_column in ('Reported Period', 'Original Date'):
    df = change_date(df, date_column)
# Missingness to drop: remove every row that has at least one missing value
df = df.dropna()
# All data: target is 'Foreclosed'; 'Zero Balance Code' is excluded from the features
y_all = df['Foreclosed']
X_all = df.drop(labels=['Foreclosed', 'Zero Balance Code'], axis=1)
# Split Train (70%)
# NOTE(review): assuming this is scikit-learn's train_test_split, test_size=0.7
# holds OUT 70% of the rows, so X_train receives 30%, not the 70% stated above.
# The saved models loaded later may have been fitted with this same split, so
# confirm against the training notebook before changing the value.
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.7,
stratify = y_all, random_state=2019)
# Split Val (15%) and Test (15%)
# NOTE(review): the held-out portion is halved here, so with test_size=0.7 above
# the validation and test sets are each 35% of the data rather than 15%.
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5,
stratify = y_test, random_state=2019)
# One hot encoding on remaining data
# Before each split is encoded, keep its 'Bank' labels as a Series re-indexed
# from 0; these are concatenated later to identify the bank of every row.
Bnk_train = X_train['Bank'].reset_index(drop=True)
X_train = onehotencoding(X_train)
Bnk_val = X_val['Bank'].reset_index(drop=True)
X_val = onehotencoding(X_val)
Bnk_test = X_test['Bank'].reset_index(drop=True)
X_test = onehotencoding(X_test)
# Update Macroeconomic variables (will not use test set)
# pca_fred is defined outside this file and receives all three splits.
X_train, X_val, X_test = pca_fred(X_train, X_val, X_test)
# Check columns (bare expression; only displayed as the last line of a cell)
X_train.columns
# List of banks
banks = [
    'Bank of America', 'Wells Fargo Bank', 'CitiMortgage',
    'JPMorgan Chase', 'GMAC Mortgage', 'SunTrust Mortgage',
    'AmTrust Bank', 'PNC Bank', 'Flagstar Bank',
]
# Build the per-bank feature/target collections for each split.
# Bank_Subsets is defined outside this file; its results are used as
# dictionaries keyed by bank name further down.
Banks_X, Banks_y = Bank_Subsets(banks, df_X=X_train, df_y=y_train)
Banks_X_val, Banks_y_val = Bank_Subsets(banks, df_X=X_val, df_y=y_val)
Banks_X_test, Banks_y_test = Bank_Subsets(banks, df_X=X_test, df_y=y_test)
# Afterwards, drop every column whose label starts with 'Bank' from the pooled
# feature frames.
not_bank_column = r'^(?!Bank).*$'
X_train = X_train.filter(regex=not_bank_column)
X_val = X_val.filter(regex=not_bank_column)
X_test = X_test.filter(regex=not_bank_column)
# All Banks: register the pooled data next to the individual banks
Banks_X['All Banks'], Banks_y['All Banks'] = X_train, y_train
Banks_X_val['All Banks'], Banks_y_val['All Banks'] = X_val, y_val
Banks_X_test['All Banks'], Banks_y_test['All Banks'] = X_test, y_test
print('Shape:', X_train.shape)
# Loading models
# Paths are assembled with os.path.join instead of '..\Data\...' literals: the
# literals relied on invalid escape sequences (\D, \P, \m) and only resolved on
# Windows. The context managers close each file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this project.
with open(os.path.join('..', 'Data', 'Pickle', 'models.pkl'), 'rb') as file_to_open:
    vote_models = pickle.load(file_to_open)
# Loading Thresholds
with open(os.path.join('..', 'Data', 'Pickle', 'model_thresholds.pkl'), 'rb') as file_to_open:
    vote_thresholds = pickle.load(file_to_open)
# Combine Train, Validation, and Testing Data
# Stack the three splits in train/val/test order and renumber the rows from 0,
# so that X, y and bank_idx line up position by position.
X = pd.concat([X_train, X_val, X_test], axis=0).reset_index(drop=True)
y = pd.concat([y_train, y_val, y_test], axis=0).reset_index(drop=True)
bank_idx = pd.concat([Bnk_train, Bnk_val, Bnk_test], axis=0).reset_index(drop=True)
# Initiate Dictionaries
# One pair per assumption level; every name gets its own, separate empty dict.
better, better_value = {}, {}
best, best_value = {}, {}
worse, worse_value = {}, {}
worst, worst_value = {}, {}
# Improved-assumption scenarios
# Every scenario does the same three things, in the order listed in the table:
#   1. print the distribution summary of the feature column(s),
#   2. call changing_assumptions for the feature(s) at the given percentile(s),
#   3. store the two returned objects under the scenario key.


def _run_improved(feature, percentile):
    """Call changing_assumptions with the notebook-wide data and model inputs."""
    return changing_assumptions(
        feature, percentile,
        banks, bank_idx, X,
        vote_models, vote_thresholds,
        Banks_X, Banks_X_val, Banks_X_test,
        Banks_y, Banks_y_val, Banks_y_test,
    )


# Summary specs: (title to print, column of X, decimals for describe().round()).
_credit_summary = [('Credit Score Distribution', 'Credit Score', 0)]
_dti_summary = [('Debt-to-Income Distribution', 'Debt-to-Income', 0)]
_ltv_summary = [('Loan-to-Value Distribution', 'Loan-to-Value (LTV)', 0)]
_income_summary = [
    ('Median Household Income Distribution', 'Median Household Income', 2),
]
_loan_change_summary = [
    ('Loan Change (1 Yr) Distribution', 'Loan Change (1 Yr)', 2),
    ('Loan Change (5 Yr) Distribution', 'Loan Change (5 Yr)', 2),
]
_liabilities_summary = [
    ('Bank Loan Liabilities (1 Year) Distribution', 'Loan Liabilities (1 Yr)', 2),
    ('Bank Loan Liabilities (5 Years) Distribution', 'Loan Liabilities (5 Yr)', 2),
]

# (result dict, value dict, key, summaries, feature(s), percentile(s)).
# Each row has its own list literals so no list object is shared between calls.
improved_scenarios = [
    # Credit Score
    (better, better_value, 'Credit Score',
     _credit_summary, 'Credit Score', 75),
    # Debt-to-Income
    (better, better_value, 'Debt-to-Income',
     _dti_summary, 'Debt-to-Income', 25),
    # Loan to Value
    (better, better_value, 'Loan-to-Value (LTV)',
     _ltv_summary, 'Loan-to-Value (LTV)', 25),
    # Median Household Income
    (better, better_value, 'Median Household Income',
     _income_summary, 'Median Household Income', 75),
    # Median Household Income (Best Assumption)
    (best, best_value, 'Median Household Income',
     _income_summary, 'Median Household Income', 100),
    # Loan Change: the next two rows pass identical arguments; the result is
    # stored once under the 1 Yr key and once under the 5 Yr key.
    (better, better_value, 'Loan Change (1 Yr)',
     _loan_change_summary,
     ['Loan Change (1 Yr)', 'Loan Change (5 Yr)'], [25, 25]),
    (better, better_value, 'Loan Change (5 Yr)',
     _loan_change_summary,
     ['Loan Change (1 Yr)', 'Loan Change (5 Yr)'], [25, 25]),
    # Bank Loan Liabilities (stored under the 1 Yr key only)
    (better, better_value, 'Loan Liabilities (1 Yr)',
     _liabilities_summary,
     ['Loan Liabilities (1 Yr)', 'Loan Liabilities (5 Yr)'], [25, 25]),
    # Bank Loan Liabilities (Best Assumption)
    (best, best_value, 'Loan Liabilities (1 Yr)',
     _liabilities_summary,
     ['Loan Liabilities (1 Yr)', 'Loan Liabilities (5 Yr)'], [100, 100]),
]

for outcome_dict, value_dict, scenario_key, summaries, feature_arg, pct_arg in improved_scenarios:
    for summary_title, summary_column, summary_decimals in summaries:
        print(summary_title)
        print(X[summary_column].describe().round(summary_decimals))
        print('')
    outcome_dict[scenario_key], value_dict[scenario_key] = _run_improved(feature_arg, pct_arg)
# Save improved assumptions
# The four dictionaries are pickled together as one list, in this order.
data = [better, better_value, best, best_value]
# Path assembled with os.path.join: the former "..\Data\Pickle\..." literal
# relied on invalid escape sequences (\D, \P, \p) and only resolved on Windows.
with open(os.path.join('..', 'Data', 'Pickle', 'pred_votes_improved.pkl'), "wb") as f:
    pickle.dump(data, f)
# Weakened-assumption scenarios
# Every scenario does the same three things, in the order listed in the table:
#   1. print the distribution summary of the feature column(s),
#   2. call changing_assumptions for the feature(s) at the given percentile(s),
#   3. store the two returned objects under the scenario key.


def _run_weakened(feature, percentile):
    """Call changing_assumptions with the notebook-wide data and model inputs."""
    return changing_assumptions(
        feature, percentile,
        banks, bank_idx, X,
        vote_models, vote_thresholds,
        Banks_X, Banks_X_val, Banks_X_test,
        Banks_y, Banks_y_val, Banks_y_test,
    )


# Summary specs: (title to print, column of X, decimals for describe().round()).
_credit_summary_w = [('Credit Score Distribution', 'Credit Score', 0)]
_dti_summary_w = [('Debt-to-Income Distribution', 'Debt-to-Income', 0)]
_ltv_summary_w = [('Loan-to-Value Distribution', 'Loan-to-Value (LTV)', 0)]
_income_summary_w = [
    ('Median Household Income Distribution', 'Median Household Income', 2),
]
_loan_change_summary_w = [
    ('Loan Change (1 Yr) Distribution', 'Loan Change (1 Yr)', 2),
    ('Loan Change (5 Yr) Distribution', 'Loan Change (5 Yr)', 2),
]
_liabilities_summary_w = [
    ('Bank Loan Liabilities (1 Year) Distribution', 'Loan Liabilities (1 Yr)', 2),
    ('Bank Loan Liabilities (5 Years) Distribution', 'Loan Liabilities (5 Yr)', 2),
]

# (result dict, value dict, key, summaries, feature(s), percentile(s)).
# Each row has its own list literals so no list object is shared between calls.
weakened_scenarios = [
    # Credit Score
    (worse, worse_value, 'Credit Score',
     _credit_summary_w, 'Credit Score', 25),
    # Debt-to-Income
    (worse, worse_value, 'Debt-to-Income',
     _dti_summary_w, 'Debt-to-Income', 75),
    # Loan-to-Value
    (worse, worse_value, 'Loan-to-Value (LTV)',
     _ltv_summary_w, 'Loan-to-Value (LTV)', 75),
    # Median Household Income
    (worse, worse_value, 'Median Household Income',
     _income_summary_w, 'Median Household Income', 25),
    # Median Household Income (Worst Assumption)
    (worst, worst_value, 'Median Household Income',
     _income_summary_w, 'Median Household Income', 0),
    # Loan Change: the next two rows pass identical arguments; the result is
    # stored once under the 1 Yr key and once under the 5 Yr key.
    (worse, worse_value, 'Loan Change (1 Yr)',
     _loan_change_summary_w,
     ['Loan Change (1 Yr)', 'Loan Change (5 Yr)'], [75, 75]),
    (worse, worse_value, 'Loan Change (5 Yr)',
     _loan_change_summary_w,
     ['Loan Change (1 Yr)', 'Loan Change (5 Yr)'], [75, 75]),
    # Bank Loan Liabilities (stored under the 1 Yr key only)
    (worse, worse_value, 'Loan Liabilities (1 Yr)',
     _liabilities_summary_w,
     ['Loan Liabilities (1 Yr)', 'Loan Liabilities (5 Yr)'], [75, 75]),
    # Bank Loan Liabilities (Worst Assumption)
    (worst, worst_value, 'Loan Liabilities (1 Yr)',
     _liabilities_summary_w,
     ['Loan Liabilities (1 Yr)', 'Loan Liabilities (5 Yr)'], [0, 0]),
]

for outcome_dict, value_dict, scenario_key, summaries, feature_arg, pct_arg in weakened_scenarios:
    for summary_title, summary_column, summary_decimals in summaries:
        print(summary_title)
        print(X[summary_column].describe().round(summary_decimals))
        print('')
    outcome_dict[scenario_key], value_dict[scenario_key] = _run_weakened(feature_arg, pct_arg)
# Save weakened assumptions
# The four dictionaries are pickled together as one list, in this order.
data = [worse, worse_value, worst, worst_value]
# Path assembled with os.path.join: the former "..\Data\Pickle\..." literal
# relied on invalid escape sequences (\D, \P, \p) and only resolved on Windows.
with open(os.path.join('..', 'Data', 'Pickle', 'pred_votes_weakened.pkl'), "wb") as f:
    pickle.dump(data, f)